library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.0     v forcats 0.5.1
## v purrr   0.3.4
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(skimr)
## Warning: package 'skimr' was built under R version 4.0.5
library(purrr)
library(tidyr)
library(tidyverse)
dfTrain <- read.csv("D:\\RStudio\\621\\Baseball\\moneyball-training-data.csv", header=TRUE)
dfTrain2 <- dfTrain

Initial Exploration

dim(dfTrain)
## [1] 2276   17
summary(dfTrain)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 630.8   1st Qu.: 71.00   1st Qu.:1383   1st Qu.:208.0  
##  Median :1270.5   Median : 82.00   Median :1454   Median :238.0  
##  Mean   :1268.5   Mean   : 80.79   Mean   :1469   Mean   :241.2  
##  3rd Qu.:1915.5   3rd Qu.: 92.00   3rd Qu.:1537   3rd Qu.:273.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##                                                                  
##  TEAM_BATTING_3B  TEAM_BATTING_HR  TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 42.00   1st Qu.:451.0   1st Qu.: 548.0  
##  Median : 47.00   Median :102.00   Median :512.0   Median : 750.0  
##  Mean   : 55.25   Mean   : 99.61   Mean   :501.6   Mean   : 735.6  
##  3rd Qu.: 72.00   3rd Qu.:147.00   3rd Qu.:580.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.00   Max.   :878.0   Max.   :1399.0  
##                                                    NA's   :102     
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
##  Min.   :  0.0   Min.   :  0.0   Min.   :29.00    Min.   : 1137  
##  1st Qu.: 66.0   1st Qu.: 38.0   1st Qu.:50.50    1st Qu.: 1419  
##  Median :101.0   Median : 49.0   Median :58.00    Median : 1518  
##  Mean   :124.8   Mean   : 52.8   Mean   :59.36    Mean   : 1779  
##  3rd Qu.:156.0   3rd Qu.: 62.0   3rd Qu.:67.00    3rd Qu.: 1682  
##  Max.   :697.0   Max.   :201.0   Max.   :95.00    Max.   :30132  
##  NA's   :131     NA's   :772     NA's   :2085                    
##  TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E 
##  Min.   :  0.0    Min.   :   0.0   Min.   :    0.0   Min.   :  65.0  
##  1st Qu.: 50.0    1st Qu.: 476.0   1st Qu.:  615.0   1st Qu.: 127.0  
##  Median :107.0    Median : 536.5   Median :  813.5   Median : 159.0  
##  Mean   :105.7    Mean   : 553.0   Mean   :  817.7   Mean   : 246.5  
##  3rd Qu.:150.0    3rd Qu.: 611.0   3rd Qu.:  968.0   3rd Qu.: 249.2  
##  Max.   :343.0    Max.   :3645.0   Max.   :19278.0   Max.   :1898.0  
##                                    NA's   :102                       
##  TEAM_FIELDING_DP
##  Min.   : 52.0   
##  1st Qu.:131.0   
##  Median :149.0   
##  Mean   :146.4   
##  3rd Qu.:164.0   
##  Max.   :228.0   
##  NA's   :286
skim(dfTrain)
Data summary
Name dfTrain
Number of rows 2276
Number of columns 17
_______________________
Column type frequency:
numeric 17
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INDEX 0 1.00 1268.46 736.35 1 630.75 1270.5 1915.50 2535 ▇▇▇▇▇
TARGET_WINS 0 1.00 80.79 15.75 0 71.00 82.0 92.00 146 ▁▁▇▅▁
TEAM_BATTING_H 0 1.00 1469.27 144.59 891 1383.00 1454.0 1537.25 2554 ▁▇▂▁▁
TEAM_BATTING_2B 0 1.00 241.25 46.80 69 208.00 238.0 273.00 458 ▁▆▇▂▁
TEAM_BATTING_3B 0 1.00 55.25 27.94 0 34.00 47.0 72.00 223 ▇▇▂▁▁
TEAM_BATTING_HR 0 1.00 99.61 60.55 0 42.00 102.0 147.00 264 ▇▆▇▅▁
TEAM_BATTING_BB 0 1.00 501.56 122.67 0 451.00 512.0 580.00 878 ▁▁▇▇▁
TEAM_BATTING_SO 102 0.96 735.61 248.53 0 548.00 750.0 930.00 1399 ▁▆▇▇▁
TEAM_BASERUN_SB 131 0.94 124.76 87.79 0 66.00 101.0 156.00 697 ▇▃▁▁▁
TEAM_BASERUN_CS 772 0.66 52.80 22.96 0 38.00 49.0 62.00 201 ▃▇▁▁▁
TEAM_BATTING_HBP 2085 0.08 59.36 12.97 29 50.50 58.0 67.00 95 ▂▇▇▅▁
TEAM_PITCHING_H 0 1.00 1779.21 1406.84 1137 1419.00 1518.0 1682.50 30132 ▇▁▁▁▁
TEAM_PITCHING_HR 0 1.00 105.70 61.30 0 50.00 107.0 150.00 343 ▇▇▆▁▁
TEAM_PITCHING_BB 0 1.00 553.01 166.36 0 476.00 536.5 611.00 3645 ▇▁▁▁▁
TEAM_PITCHING_SO 102 0.96 817.73 553.09 0 615.00 813.5 968.00 19278 ▇▁▁▁▁
TEAM_FIELDING_E 0 1.00 246.48 227.77 65 127.00 159.0 249.25 1898 ▇▁▁▁▁
TEAM_FIELDING_DP 286 0.87 146.39 26.23 52 131.00 149.0 164.00 228 ▁▂▇▆▁
str(dfTrain)
## 'data.frame':    2276 obs. of  17 variables:
##  $ INDEX           : int  1 2 3 4 5 6 7 8 11 12 ...
##  $ TARGET_WINS     : int  39 70 86 70 82 75 80 85 86 76 ...
##  $ TEAM_BATTING_H  : int  1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
##  $ TEAM_BATTING_2B : int  194 219 232 209 186 200 179 171 197 213 ...
##  $ TEAM_BATTING_3B : int  39 22 35 38 27 36 54 37 40 18 ...
##  $ TEAM_BATTING_HR : int  13 190 137 96 102 92 122 115 114 96 ...
##  $ TEAM_BATTING_BB : int  143 685 602 451 472 443 525 456 447 441 ...
##  $ TEAM_BATTING_SO : int  842 1075 917 922 920 973 1062 1027 922 827 ...
##  $ TEAM_BASERUN_SB : int  NA 37 46 43 49 107 80 40 69 72 ...
##  $ TEAM_BASERUN_CS : int  NA 28 27 30 39 59 54 36 27 34 ...
##  $ TEAM_BATTING_HBP: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ TEAM_PITCHING_H : int  9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
##  $ TEAM_PITCHING_HR: int  84 191 137 97 102 92 122 116 114 96 ...
##  $ TEAM_PITCHING_BB: int  927 689 602 454 472 443 525 459 447 441 ...
##  $ TEAM_PITCHING_SO: int  5456 1082 917 928 920 973 1062 1033 922 827 ...
##  $ TEAM_FIELDING_E : int  1011 193 175 164 138 123 136 112 127 131 ...
##  $ TEAM_FIELDING_DP: int  NA 155 153 156 168 149 186 136 169 159 ...

Outlier Analysis and zeroes as nas

for(i in 2:ncol(dfTrain)) {                              
  print(ggplot(dfTrain, aes(dfTrain[ , i])) +
          coord_flip() +  
          xlab(colnames(dfTrain)[i])  +
          geom_boxplot())

  print(head(sort(dfTrain[,i])))
  
  print(tail(sort(dfTrain[,i])))

}

## [1]  0 12 14 17 21 22
## [1] 126 128 129 134 135 146

## [1]  891  992 1009 1116 1122 1137
## [1] 2305 2333 2343 2372 2496 2554

## [1]  69 112 113 118 123 125
## [1] 378 382 392 393 403 458

## [1]  0  0  8  9 11 12
## [1] 165 166 190 197 200 223

## [1] 0 0 0 0 0 0
## [1] 246 247 249 257 260 264

## [1]  0 12 29 34 45 45
## [1] 806 815 819 824 860 878
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).

## [1] 0 0 0 0 0 0
## [1] 1273 1303 1320 1326 1335 1399
## Warning: Removed 131 rows containing non-finite values (stat_boxplot).

## [1]  0  0 14 18 18 18
## [1] 558 562 567 632 654 697
## Warning: Removed 772 rows containing non-finite values (stat_boxplot).

## [1]  0  7 11 12 14 14
## [1] 171 186 193 200 200 201
## Warning: Removed 2085 rows containing non-finite values (stat_boxplot).

## [1] 29 29 30 35 35 35
## [1] 89 89 89 89 90 95

## [1] 1137 1168 1184 1187 1202 1202
## [1] 14749 16038 16871 20088 24057 30132

## [1] 0 0 0 0 0 0
## [1] 291 297 301 320 320 343

## [1]   0 119 124 131 140 144
## [1] 1750 2169 2396 2840 2876 3645
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).

## [1] 0 0 0 0 0 0
## [1]  2492  3450  4224  5456 12758 19278

## [1] 65 66 68 72 74 77
## [1] 1553 1567 1728 1740 1890 1898
## Warning: Removed 286 rows containing non-finite values (stat_boxplot).

## [1] 52 64 68 71 72 72
## [1] 215 215 218 219 225 228

There are 4 categories where 0s may be nas: Pitching and Batting HR and Pitching and batting SO. We look more closely at these categories:

dfTrain_ZeroAsNA <- dfTrain %>%
dplyr::select(TEAM_PITCHING_SO, TEAM_PITCHING_HR, TEAM_BATTING_SO, TEAM_BATTING_HR)

hist(dfTrain_ZeroAsNA)

Will do nothing with outliers or na as zero for now

Taking care of NA

Team_Batting_HPBA has too many so we remove it:

dfTrain2 <- dfTrain2 %>%
  dplyr::select(-TEAM_BATTING_HBP) 

Before we impute the values for NAs, we need to ensure there isn’t any kind of grouping effect for the records with NA. Fact that several columns have the same number of missings suggests there might be. So first we look to see if the missings are collinear:

dfTrain2 <- dfTrain2 %>%
  mutate(Missing_Flag = ifelse(is.na(TEAM_BATTING_SO),1,0))

dfTrain3 <- dfTrain2 %>%
  dplyr::filter(Missing_Flag == 0) %>%
  dplyr::select(TEAM_BATTING_SO, TEAM_PITCHING_SO, TEAM_BASERUN_CS, TEAM_BASERUN_SB) 
  

summary(dfTrain3)
##  TEAM_BATTING_SO  TEAM_PITCHING_SO  TEAM_BASERUN_CS TEAM_BASERUN_SB
##  Min.   :   0.0   Min.   :    0.0   Min.   :  0.0   Min.   :  0.0  
##  1st Qu.: 548.0   1st Qu.:  615.0   1st Qu.: 38.0   1st Qu.: 65.0  
##  Median : 750.0   Median :  813.5   Median : 49.0   Median : 98.0  
##  Mean   : 735.6   Mean   :  817.7   Mean   : 52.8   Mean   :120.8  
##  3rd Qu.: 930.0   3rd Qu.:  968.0   3rd Qu.: 62.0   3rd Qu.:147.0  
##  Max.   :1399.0   Max.   :19278.0   Max.   :201.0   Max.   :697.0  
##                                     NA's   :670     NA's   :131

There is some cohort effect as there is complete duplication with pitching so and batting so, and some overlap with baserun cs. Now lets impute the median and see how well the new modelperforms vs the old:

dfTrain_ImputedMedian <- data.frame(
    sapply(dfTrain2, function(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)))

dfTrain_ImputedMean <- data.frame(
    sapply(dfTrain2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)))

m1 <- lm(TARGET_WINS ~ ., dfTrain2)
m2 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMedian)
m3 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMean)

summary(m1)$adj.r.squared
## [1] 0.4330872
summary(m2)$adj.r.squared
## [1] 0.313437
summary(m3)$adj.r.squared
## [1] 0.3169625

There appears to be a large effect.

Now we can look at interactions between the “cohort” and other variables:

par(mfcol=c(2,2))



dfTrain_ImputedMean$Missing_Flag <- as.factor(dfTrain_ImputedMean$Missing_Flag)

for(i in 2:ncol(dfTrain_ImputedMean)) {                              
  print(ggplot(dfTrain_ImputedMean, aes(dfTrain_ImputedMean[ ,i], TARGET_WINS, color=Missing_Flag)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  ggtitle(colnames(dfTrain_ImputedMean)[i]))
}
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

The interaction analysis suggests that the cohort is not random - there are numerous interactions with many other variables, some of which are quite counterinutitive (team pitching H). So we could either do a random effects/flag/interactions or toss them. Becuase bad data is not reproducible I will toss, at the expense of better predicitons if I can identify the cohort in the eval data.

dfTrain_ImputedMean_NoCohort <- dfTrain_ImputedMean %>%
  filter(Missing_Flag==0) %>%
  dplyr::select(-Missing_Flag)

summary(dfTrain_ImputedMean_NoCohort)
##      INDEX         TARGET_WINS     TEAM_BATTING_H TEAM_BATTING_2B
##  Min.   :   1.0   Min.   :  0.00   Min.   : 891   Min.   : 69.0  
##  1st Qu.: 640.2   1st Qu.: 71.00   1st Qu.:1389   1st Qu.:211.2  
##  Median :1275.5   Median : 82.00   Median :1458   Median :240.0  
##  Mean   :1275.2   Mean   : 80.76   Mean   :1475   Mean   :243.9  
##  3rd Qu.:1923.8   3rd Qu.: 91.00   3rd Qu.:1541   3rd Qu.:275.0  
##  Max.   :2535.0   Max.   :146.00   Max.   :2554   Max.   :458.0  
##  TEAM_BATTING_3B  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO 
##  Min.   :  0.00   Min.   :  0.0   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.: 34.00   1st Qu.: 48.0   1st Qu.:456.0   1st Qu.: 548.0  
##  Median : 46.00   Median :107.0   Median :517.0   Median : 750.0  
##  Mean   : 54.45   Mean   :103.4   Mean   :505.1   Mean   : 735.6  
##  3rd Qu.: 71.00   3rd Qu.:148.0   3rd Qu.:582.0   3rd Qu.: 930.0  
##  Max.   :223.00   Max.   :264.0   Max.   :878.0   Max.   :1399.0  
##  TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR
##  Min.   :  0.0   Min.   :  0.0   Min.   : 1137   Min.   :  0.0   
##  1st Qu.: 66.0   1st Qu.: 44.0   1st Qu.: 1425   1st Qu.: 58.0   
##  Median :102.0   Median : 52.8   Median : 1521   Median :111.0   
##  Mean   :121.1   Mean   : 52.8   Mean   : 1794   Mean   :109.7   
##  3rd Qu.:143.8   3rd Qu.: 55.0   3rd Qu.: 1694   3rd Qu.:152.8   
##  Max.   :697.0   Max.   :201.0   Max.   :30132   Max.   :343.0   
##  TEAM_PITCHING_BB TEAM_PITCHING_SO  TEAM_FIELDING_E  TEAM_FIELDING_DP
##  Min.   :   0.0   Min.   :    0.0   Min.   :  65.0   Min.   : 52.0   
##  1st Qu.: 479.2   1st Qu.:  615.0   1st Qu.: 126.0   1st Qu.:137.0   
##  Median : 542.0   Median :  813.5   Median : 155.0   Median :146.4   
##  Mean   : 557.5   Mean   :  817.7   Mean   : 243.9   Mean   :148.6   
##  3rd Qu.: 614.8   3rd Qu.:  968.0   3rd Qu.: 234.0   3rd Qu.:162.0   
##  Max.   :3645.0   Max.   :19278.0   Max.   :1898.0   Max.   :228.0

Curious to look at impact of imputing median on correlation:

summary(lm(dfTrain$TARGET_WINS ~ dfTrain$TEAM_PITCHING_SO))
## 
## Call:
## lm(formula = dfTrain$TARGET_WINS ~ dfTrain$TEAM_PITCHING_SO)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.570  -9.402   0.970  10.484  63.430 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              82.5704787  0.5945630 138.876  < 2e-16 ***
## dfTrain$TEAM_PITCHING_SO -0.0022085  0.0006023  -3.667 0.000252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.53 on 2172 degrees of freedom
##   (102 observations deleted due to missingness)
## Multiple R-squared:  0.006152,   Adjusted R-squared:  0.005695 
## F-statistic: 13.45 on 1 and 2172 DF,  p-value: 0.0002515
summary(lm(dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian$TEAM_PITCHING_SO))
## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian$TEAM_PITCHING_SO)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.597  -9.554   0.980  10.633  63.403 
## 
## Coefficients:
##                                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            82.5967253  0.5971665 138.314  < 2e-16
## dfTrain_ImputedMedian$TEAM_PITCHING_SO -0.0022089  0.0006093  -3.625 0.000295
##                                           
## (Intercept)                            ***
## dfTrain_ImputedMedian$TEAM_PITCHING_SO ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.71 on 2274 degrees of freedom
## Multiple R-squared:  0.005746,   Adjusted R-squared:  0.005308 
## F-statistic: 13.14 on 1 and 2274 DF,  p-value: 0.0002953

Th effect is minimal.

dfTrain %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3478 rows containing non-finite values (stat_bin).

dfTrain %>%
  keep(is.numeric) %>% 
  gather() %>% 
  ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_boxplot() +
  coord_flip()
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).

Outlier analysis again

Outlier Analysis and zeroes as nas

for(i in 2:ncol(dfTrain_ImputedMean_NoCohort)) {                              
  print(ggplot(dfTrain_ImputedMean_NoCohort, aes(dfTrain_ImputedMean_NoCohort[ , i])) +
          coord_flip() +  
          xlab(colnames(dfTrain_ImputedMean_NoCohort)[i])  +
          geom_boxplot())

  print(head(sort(dfTrain_ImputedMean_NoCohort[,i])))
  
  print(tail(sort(dfTrain_ImputedMean_NoCohort[,i])))

}

## [1]  0 12 14 17 21 22
## [1] 126 128 129 134 135 146

## [1]  891  992 1009 1116 1122 1137
## [1] 2305 2333 2343 2372 2496 2554

## [1]  69 112 113 118 127 130
## [1] 378 382 392 393 403 458

## [1]  0  0  8  9 11 12
## [1] 165 166 190 197 200 223

## [1] 0 0 0 0 0 0
## [1] 246 247 249 257 260 264

## [1]  0 12 29 34 45 45
## [1] 806 815 819 824 860 878

## [1] 0 0 0 0 0 0
## [1] 1273 1303 1320 1326 1335 1399

## [1]  0  0 14 18 18 18
## [1] 558 562 567 632 654 697

## [1]  0  7 11 12 14 14
## [1] 171 186 193 200 200 201

## [1] 1137 1168 1184 1187 1202 1202
## [1] 14749 16038 16871 20088 24057 30132

## [1] 0 0 0 0 0 0
## [1] 291 297 301 320 320 343

## [1]   0 119 124 131 140 144
## [1] 1750 2169 2396 2840 2876 3645

## [1] 0 0 0 0 0 0
## [1]  2492  3450  4224  5456 12758 19278

## [1] 65 66 68 72 74 77
## [1] 1553 1567 1728 1740 1890 1898

## [1] 52 64 71 72 75 78
## [1] 215 215 218 219 225 228

Second Exploration

a. Dependent variable

hist(dfTrain$TARGET_WINS, bins=20)
## Warning in plot.window(xlim, ylim, "", ...): "bins" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "bins"
## is not a graphical parameter
## Warning in axis(1, ...): "bins" is not a graphical parameter
## Warning in axis(2, ...): "bins" is not a graphical parameter

head(sort(dfTrain$TARGET_WINS))
## [1]  0 12 14 17 21 22
dfTrain_ZeroWins <- dfTrain %>%
  dplyr::filter(TARGET_WINS ==0)

head(dfTrain_ZeroWins, 1)
##   INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1  1347           0            891             135               0
##   TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1               0               0               0               0
##   TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1               0               NA           24057                0
##   TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1                0                0            1890               NA

Target_Wins appears normally distributed - the zero is suspicious but I’m going to leave it.

b. Look at correlations throughout the variables and inspect multi-colinnearity

dfCor <- as.data.frame(cor(dfTrain_ImputedMean_NoCohort))
dfCor
##                          INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX             1.0000000000 -0.02928140    -0.03131390    -0.003976934
## TARGET_WINS      -0.0292813985  1.00000000     0.39476995     0.293205037
## TEAM_BATTING_H   -0.0313139014  0.39476995     1.00000000     0.540648272
## TEAM_BATTING_2B  -0.0039769341  0.29320504     0.54064827     1.000000000
## TEAM_BATTING_3B  -0.0049758496  0.13685882     0.45802046    -0.085325497
## TEAM_BATTING_HR   0.0413809930  0.19059035    -0.06194956     0.393641975
## TEAM_BATTING_BB  -0.0358540809  0.23250609    -0.10545406     0.230196649
## TEAM_BATTING_SO   0.0814501106 -0.03175071    -0.46385357     0.162685188
## TEAM_BASERUN_SB   0.0435154747  0.11143414     0.14886129    -0.153728585
## TEAM_BASERUN_CS   0.0004632733  0.01610843     0.01198251    -0.077632602
## TEAM_PITCHING_H   0.0146890757 -0.11576530     0.29979491     0.008872511
## TEAM_PITCHING_HR  0.0403725584  0.20531868     0.02082589     0.412455481
## TEAM_PITCHING_BB -0.0233549401  0.12063924     0.07067846     0.149565361
## TEAM_PITCHING_SO  0.0558901457 -0.07843609    -0.25265679     0.064792315
## TEAM_FIELDING_E  -0.0068738726 -0.17639551     0.28252119    -0.232247607
## TEAM_FIELDING_DP  0.0061318975 -0.02860414     0.04535652     0.178563220
##                  TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX                -0.00497585      0.04138099     -0.03585408
## TARGET_WINS           0.13685882      0.19059035      0.23250609
## TEAM_BATTING_H        0.45802046     -0.06194956     -0.10545406
## TEAM_BATTING_2B      -0.08532550      0.39364197      0.23019665
## TEAM_BATTING_3B       1.00000000     -0.63765753     -0.28160593
## TEAM_BATTING_HR      -0.63765753      1.00000000      0.50439692
## TEAM_BATTING_BB      -0.28160593      0.50439692      1.00000000
## TEAM_BATTING_SO      -0.66978119      0.72706935      0.37975087
## TEAM_BASERUN_SB       0.49301668     -0.39942181     -0.06545891
## TEAM_BASERUN_CS       0.19833581     -0.30347433     -0.08612025
## TEAM_PITCHING_H       0.20396690     -0.27656010     -0.46585690
## TEAM_PITCHING_HR     -0.56629509      0.96659392      0.44681242
## TEAM_PITCHING_BB      0.01294580      0.10677385      0.47385394
## TEAM_PITCHING_SO     -0.25881893      0.18470756     -0.02075682
## TEAM_FIELDING_E       0.51354615     -0.59891151     -0.66138116
## TEAM_FIELDING_DP     -0.21908499      0.33368751      0.32158157
##                  TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX                 0.08145011      0.04351547    0.0004632733
## TARGET_WINS          -0.03175071      0.11143414    0.0161084320
## TEAM_BATTING_H       -0.46385357      0.14886129    0.0119825143
## TEAM_BATTING_2B       0.16268519     -0.15372858   -0.0776326024
## TEAM_BATTING_3B      -0.66978119      0.49301668    0.1983358054
## TEAM_BATTING_HR       0.72706935     -0.39942181   -0.3034743273
## TEAM_BATTING_BB       0.37975087     -0.06545891   -0.0861202523
## TEAM_BATTING_SO       1.00000000     -0.23837153   -0.1566149092
## TEAM_BASERUN_SB      -0.23837153      1.00000000    0.2869124889
## TEAM_BASERUN_CS      -0.15661491      0.28691249    1.0000000000
## TEAM_PITCHING_H      -0.37568637      0.07198568   -0.0369545996
## TEAM_PITCHING_HR      0.66717889     -0.36564098   -0.3034478040
## TEAM_PITCHING_BB      0.03700514      0.14323815   -0.0542531880
## TEAM_PITCHING_SO      0.41623330     -0.05615058   -0.0686217842
## TEAM_FIELDING_E      -0.58466444      0.36999309    0.0236201201
## TEAM_FIELDING_DP      0.14599850     -0.24957358   -0.1563091914
##                  TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## INDEX                0.014689076       0.04037256      -0.02335494
## TARGET_WINS         -0.115765302       0.20531868       0.12063924
## TEAM_BATTING_H       0.299794910       0.02082589       0.07067846
## TEAM_BATTING_2B      0.008872511       0.41245548       0.14956536
## TEAM_BATTING_3B      0.203966905      -0.56629509       0.01294580
## TEAM_BATTING_HR     -0.276560100       0.96659392       0.10677385
## TEAM_BATTING_BB     -0.465856896       0.44681242       0.47385394
## TEAM_BATTING_SO     -0.375686369       0.66717889       0.03700514
## TEAM_BASERUN_SB      0.071985680      -0.36564098       0.14323815
## TEAM_BASERUN_CS     -0.036954600      -0.30344780      -0.05425319
## TEAM_PITCHING_H      1.000000000      -0.16448724       0.31845282
## TEAM_PITCHING_HR    -0.164487236       1.00000000       0.19575531
## TEAM_PITCHING_BB     0.318452818       0.19575531       1.00000000
## TEAM_PITCHING_SO     0.267248074       0.20588053       0.48849865
## TEAM_FIELDING_E      0.672838853      -0.50175814      -0.01637592
## TEAM_FIELDING_DP    -0.088957308       0.32336753       0.15211734
##                  TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX                  0.05589015    -0.006873873      0.006131897
## TARGET_WINS           -0.07843609    -0.176395507     -0.028604138
## TEAM_BATTING_H        -0.25265679     0.282521195      0.045356517
## TEAM_BATTING_2B        0.06479231    -0.232247607      0.178563220
## TEAM_BATTING_3B       -0.25881893     0.513546149     -0.219084985
## TEAM_BATTING_HR        0.18470756    -0.598911507      0.333687510
## TEAM_BATTING_BB       -0.02075682    -0.661381160      0.321581568
## TEAM_BATTING_SO        0.41623330    -0.584664436      0.145998500
## TEAM_BASERUN_SB       -0.05615058     0.369993094     -0.249573580
## TEAM_BASERUN_CS       -0.06862178     0.023620120     -0.156309191
## TEAM_PITCHING_H        0.26724807     0.672838853     -0.088957308
## TEAM_PITCHING_HR       0.20588053    -0.501758136      0.323367525
## TEAM_PITCHING_BB       0.48849865    -0.016375919      0.152117341
## TEAM_PITCHING_SO       1.00000000    -0.023291783      0.010392318
## TEAM_FIELDING_E       -0.02329178     1.000000000     -0.257897297
## TEAM_FIELDING_DP       0.01039232    -0.257897297      1.000000000
heatmap(as.matrix(dfCor), Rowv = NA, Colv = NA)   

Invsteigate suspicious HR category

cor.test(dfTrain$TEAM_PITCHING_HR, dfTrain$TARGET_WINS)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain$TEAM_PITCHING_HR and dfTrain$TARGET_WINS
## t = 9.1789, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1490846 0.2283275
## sample estimates:
##       cor 
## 0.1890137
ggplot(dfTrain, aes(TEAM_PITCHING_HR, TEAM_BATTING_HR, color=INDEX)) +
  geom_point()

hist(dfTrain$TEAM_PITCHING_HR, breaks=100)

plot(dfTrain$TEAM_PITCHING_HR, dfTrain$TARGET_WINS)

m1 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain)
summary(m1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.657  -9.956   0.636  10.055  67.477 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.656920   0.646540 117.018   <2e-16 ***
## TEAM_PITCHING_HR  0.048572   0.005292   9.179   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared:  0.03573,    Adjusted R-squared:  0.0353 
## F-statistic: 84.25 on 1 and 2274 DF,  p-value: < 2.2e-16
plot(m1)

library(car) 
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:psych':
## 
##     logit
influencePlot(m1, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##        StudRes          Hat        CookD
## 299   4.380293 0.0006944747 0.0066141630
## 832   0.173993 0.0070267976 0.0001071615
## 964  -1.050146 0.0058117315 0.0032231964
## 1211 -4.919225 0.0017463018 0.0209523937
## 2233 -4.132563 0.0017463018 0.0148329515
dfTrain2 <- dfTrain[-c(1211,2233,299,1825, 832), ]
cor.test(dfTrain2$TEAM_PITCHING_HR, dfTrain2$TARGET_WINS)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain2$TEAM_PITCHING_HR and dfTrain2$TARGET_WINS
## t = 8.8525, df = 2269, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1426547 0.2221771
## sample estimates:
##       cor 
## 0.1827147
m2 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain2)
summary(m2)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.949  -9.929   0.614  10.028  55.992 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.948820   0.639361 118.789   <2e-16 ***
## TEAM_PITCHING_HR  0.046356   0.005237   8.852   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.23 on 2269 degrees of freedom
## Multiple R-squared:  0.03338,    Adjusted R-squared:  0.03296 
## F-statistic: 78.37 on 1 and 2269 DF,  p-value: < 2.2e-16
plot(m2)

library(car) 
influencePlot(m2, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##        StudRes          Hat       CookD
## 964  -1.039523 0.0058683344 0.003189286
## 982  -3.886600 0.0017628831 0.013255859
## 1810  2.114722 0.0049482791 0.011102505
## 1882 -1.303158 0.0058683344 0.005010737
## 2012  3.688318 0.0006272236 0.004245374
summary(m1$residuals)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -75.6569  -9.9562   0.6359   0.0000  10.0552  67.4774
describe(m1$residuals)
##    vars    n mean    sd median trimmed   mad    min   max  range  skew kurtosis
## X1    1 2276    0 15.47   0.64     0.2 14.84 -75.66 67.48 143.13 -0.18     0.86
##      se
## X1 0.32
dfTrain$Residuals <- m1$residuals
dfTrain$Fitted <- m1$fitted.values
library(tidyverse)
dfTrain_WithoutHR <- dfTrain %>%
  dplyr::filter(TARGET_WINS >=50 | TEAM_PITCHING_HR!=0)

hist(dfTrain_WithoutHR$TEAM_PITCHING_HR)

plot(dfTrain_WithoutHR$TEAM_PITCHING_HR, dfTrain_WithoutHR$TARGET_WINS)

m3 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain_WithoutHR)
summary(m3)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain_WithoutHR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.208  -9.802   0.653   9.952  66.914 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      76.624136   0.636539 120.376  < 2e-16 ***
## TEAM_PITCHING_HR  0.041723   0.005197   8.028 1.58e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.09 on 2263 degrees of freedom
## Multiple R-squared:  0.02769,    Adjusted R-squared:  0.02726 
## F-statistic: 64.45 on 1 and 2263 DF,  p-value: 1.576e-15
plot(m3)

library(car) 
influencePlot(m3, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter

## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter

##         StudRes          Hat        CookD
## 299   4.4557422 0.0007060697 0.0069560265
## 829   0.2703629 0.0070966028 0.0002613277
## 856  -3.7394216 0.0014507753 0.0101000850
## 961  -0.9956483 0.0058665293 0.0029249611
## 1804  2.1826581 0.0049451007 0.0118181032
dfTrain_BiModal <- dfTrain %>%
  mutate(HR_Low = if_else(TEAM_PITCHING_HR<50,1,0)) %>%
  mutate(HR_High = if_else(TEAM_PITCHING_HR>=50,1,0))

dfCor_BiModal <- as.data.frame(cor(dfTrain_BiModal))
m4 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR + HR_Low, data=dfTrain_BiModal)
summary(m4)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR + HR_Low, data = dfTrain_BiModal)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.692  -9.976   0.653  10.058  67.556 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.529253   1.069339  70.632  < 2e-16 ***
## TEAM_PITCHING_HR  0.049398   0.007641   6.465 1.24e-10 ***
## HR_Low            0.162504   1.084033   0.150    0.881    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2273 degrees of freedom
## Multiple R-squared:  0.03574,    Adjusted R-squared:  0.03489 
## F-statistic: 42.12 on 2 and 2273 DF,  p-value: < 2.2e-16
plot(m4)

dfHighHR <- dfTrain_BiModal %>%
  dplyr::filter(HR_High ==1)

dfLowHR <- dfTrain_BiModal %>%
  dplyr::filter(HR_Low==1)

t.test(dfLowHR$TARGET_WINS, dfHighHR$TARGET_WINS)
## 
##  Welch Two Sample t-test
## 
## data:  dfLowHR$TARGET_WINS and dfHighHR$TARGET_WINS
## t = -5.4141, df = 753, p-value = 8.291e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.665804 -3.118167
## sample estimates:
## mean of x mean of y 
##  77.11327  82.00526
m5 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfHighHR)
summary(m5)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfHighHR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.641  -9.293   0.650   9.127  67.238 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      76.107959   0.957161  79.514  < 2e-16 ***
## TEAM_PITCHING_HR  0.044983   0.006848   6.569 6.72e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.73 on 1709 degrees of freedom
## Multiple R-squared:  0.02463,    Adjusted R-squared:  0.02405 
## F-statistic: 43.15 on 1 and 1709 DF,  p-value: 6.72e-11
plot(m5)

dfCor_HR <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$TEAM_PITCHING_HR)) 
dfCor_Low <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$HR_Low))

plot(dfTrain$TEAM_BATTING_HR, dfTrain$TEAM_PITCHING_HR)

dfTrain$HR_Diff <- dfTrain$TEAM_PITCHING_HR -dfTrain$TEAM_BATTING_HR
hist(dfTrain$HR_Diff, breaks=100)

describe(dfTrain$HR_Diff)
##    vars    n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 2276 6.09 15.1      2    2.93 2.97  -2 249   251 6.98    71.83 0.32

Sum of HR allowed greatly exceeds sum of HR hit

m6 <- lm(dfTrain$TEAM_BATTING_HR ~ dfTrain$TEAM_PITCHING_HR)
summary(m6)
## 
## Call:
## lm(formula = dfTrain$TEAM_BATTING_HR ~ dfTrain$TEAM_PITCHING_HR)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -234.609    0.123    1.336    6.992   12.817 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -1.592392   0.621547  -2.562   0.0105 *  
## dfTrain$TEAM_PITCHING_HR  0.957481   0.005087 188.217   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.87 on 2274 degrees of freedom
## Multiple R-squared:  0.9397, Adjusted R-squared:  0.9397 
## F-statistic: 3.543e+04 on 1 and 2274 DF,  p-value: < 2.2e-16
plot(m6)

cor.test(dfTrain$TEAM_BATTING_BB, dfTrain$TEAM_PITCHING_BB)
## 
##  Pearson's product-moment correlation
## 
## data:  dfTrain$TEAM_BATTING_BB and dfTrain$TEAM_PITCHING_BB
## t = 26.759, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4574724 0.5199930
## sample estimates:
##       cor 
## 0.4893613
plot(dfTrain$TEAM_BATTING_BB, dfTrain$TEAM_PITCHING_BB)

  1. look at relationships with Dependent variable
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort

for(i in 2:ncol(dfTrain_ImputedMedian)) {                              
  print(ggplot(dfTrain_ImputedMedian, aes(dfTrain_ImputedMedian[ , i], x = dfTrain_ImputedMedian$TARGET_WINS)) +
          xlab(colnames(dfTrain)[i])  +
          stat_smooth(method=loess) +
          geom_point())

m <- lm(dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,i])

par(mfcol=c(2,2))
print(summary(m))
print(plot(m))
}
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning in summary.lm(m): essentially perfect fit: summary may be unreliable

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.647e-14 -1.120e-15 -7.000e-16 -2.800e-16  1.614e-12 
## 
## Coefficients:
##                             Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)                1.756e-13  3.928e-15 4.470e+01   <2e-16 ***
## dfTrain_ImputedMedian[, i] 1.000e+00  4.775e-17 2.094e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.467e-14 on 2172 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 4.385e+32 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.761  -8.515   0.971   9.783  43.230 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                17.686332   3.164963   5.588 2.58e-08 ***
## dfTrain_ImputedMedian[, i]  0.042775   0.002136  20.025  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.31 on 2172 degrees of freedom
## Multiple R-squared:  0.1558, Adjusted R-squared:  0.1555 
## F-statistic:   401 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.863  -9.376   0.670  10.121  57.415 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                56.346919   1.737969   32.42   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.100118   0.007005   14.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.89 on 2172 degrees of freedom
## Multiple R-squared:  0.08597,    Adjusted R-squared:  0.08555 
## F-statistic: 204.3 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -76.628  -8.980   1.143  10.428  60.940 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                76.62804    0.72265 106.038  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.07596    0.01180   6.439 1.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.43 on 2172 degrees of freedom
## Multiple R-squared:  0.01873,    Adjusted R-squared:  0.01828 
## F-statistic: 41.46 on 1 and 2172 DF,  p-value: 1.477e-10

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -75.596  -9.734   0.553  10.041  68.954 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                75.595947   0.658670 114.771   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.050009   0.005527   9.048   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.29 on 2172 degrees of freedom
## Multiple R-squared:  0.03632,    Adjusted R-squared:  0.03588 
## F-statistic: 81.87 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -65.936  -9.554   0.579   9.674  78.185 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                65.935670   1.370076   48.13   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.029358   0.002635   11.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared:  0.05406,    Adjusted R-squared:  0.05362 
## F-statistic: 124.1 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.228  -9.308   0.963  10.609  63.772 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                82.228036   1.043434   78.81   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.001990   0.001344   -1.48    0.139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.001008,   Adjusted R-squared:  0.0005482 
## F-statistic: 2.192 on 1 and 2172 DF,  p-value: 0.1389

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.284  -9.080   1.024  10.198  65.160 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                78.28444    0.57917 135.166  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.02048    0.00392   5.226  1.9e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.48 on 2172 degrees of freedom
## Multiple R-squared:  0.01242,    Adjusted R-squared:  0.01196 
## F-statistic: 27.31 on 1 and 2172 DF,  p-value: 1.899e-07

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.071  -9.493   1.233  10.483  65.236 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                80.07067    0.98260  81.489   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.01314    0.01750   0.751    0.453    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0002595,  Adjusted R-squared:  -0.0002008 
## F-statistic: 0.5637 on 1 and 2172 DF,  p-value: 0.4528

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.165  -9.462   0.897  10.651  68.914 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.0150688  0.5308401 156.384  < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0012543  0.0002309  -5.432  6.2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared:  0.0134, Adjusted R-squared:  0.01295 
## F-statistic:  29.5 on 1 and 2172 DF,  p-value: 6.205e-08

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.906  -9.846   0.705   9.965  67.942 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                74.905514   0.682649 109.728   <2e-16 ***
## dfTrain_ImputedMedian[, i]  0.053432   0.005465   9.777   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.25 on 2172 degrees of freedom
## Multiple R-squared:  0.04216,    Adjusted R-squared:  0.04171 
## F-statistic: 95.59 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -74.528  -9.251   0.948  10.415  70.006 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                74.528116   1.149967  64.809  < 2e-16 ***
## dfTrain_ImputedMedian[, i]  0.011187   0.001975   5.664 1.68e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2172 degrees of freedom
## Multiple R-squared:  0.01455,    Adjusted R-squared:  0.0141 
## F-statistic: 32.08 on 1 and 2172 DF,  p-value: 1.678e-08

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -82.570  -9.402   0.970  10.484  63.430 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                82.5704787  0.5945630 138.876  < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0022085  0.0006023  -3.667 0.000252 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.53 on 2172 degrees of freedom
## Multiple R-squared:  0.006152,   Adjusted R-squared:  0.005695 
## F-statistic: 13.45 on 1 and 2172 DF,  p-value: 0.0002515

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.645750   0.476605 175.503   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16

## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.

## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'

## 
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[, 
##     i])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.809  -9.322   1.075  10.459  65.191 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                83.70498    2.23001  37.536   <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.01979    0.01484  -1.334    0.182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0008182,  Adjusted R-squared:  0.0003582 
## F-statistic: 1.779 on 1 and 2172 DF,  p-value: 0.1825

## NULL

Trying a transformation on team fielding error. it improves it to some degree.

dfTrain_ImputedMedian2 <- dfTrain_ImputedMedian %>%
  mutate(sq = TEAM_FIELDING_E^2)

summary(lm(TARGET_WINS ~ TEAM_FIELDING_E, dfTrain_ImputedMedian2))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = dfTrain_ImputedMedian2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83.645750   0.476605 175.503   <2e-16 ***
## TEAM_FIELDING_E -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E + sq, dfTrain_ImputedMedian2))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + sq, data = dfTrain_ImputedMedian2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.981  -9.787   0.647  10.285  72.647 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.985e+01  7.178e-01 111.246  < 2e-16 ***
## TEAM_FIELDING_E  1.386e-02  3.924e-03   3.533 0.000419 ***
## sq              -2.177e-05  3.108e-06  -7.005 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.17 on 2171 degrees of freedom
## Multiple R-squared:  0.05253,    Adjusted R-squared:  0.05165 
## F-statistic: 60.18 on 2 and 2171 DF,  p-value: < 2.2e-16

Regression

#Two mods made - team pitching has the square temr and intreaction between hits and dp

par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
summary(mod_2)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.264  -8.466   0.163   8.273  58.924 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      23.9560970  5.4876280   4.365 1.33e-05 ***
## INDEX            -0.0004771  0.0003788  -1.259 0.207988    
## TEAM_BATTING_H    0.0482928  0.0037112  13.013  < 2e-16 ***
## TEAM_BATTING_2B  -0.0232530  0.0092311  -2.519 0.011841 *  
## TEAM_BATTING_3B   0.0595670  0.0169134   3.522 0.000437 ***
## TEAM_BATTING_HR   0.0655424  0.0272468   2.406 0.016234 *  
## TEAM_BATTING_BB   0.0084691  0.0057882   1.463 0.143567    
## TEAM_BATTING_SO  -0.0100510  0.0025721  -3.908 9.61e-05 ***
## TEAM_BASERUN_SB   0.0254437  0.0044746   5.686 1.47e-08 ***
## TEAM_BASERUN_CS   0.0006521  0.0161429   0.040 0.967780    
## TEAM_PITCHING_H  -0.0009865  0.0003651  -2.702 0.006949 ** 
## TEAM_PITCHING_HR  0.0116273  0.0240289   0.484 0.628514    
## TEAM_PITCHING_BB  0.0014808  0.0040999   0.361 0.718000    
## TEAM_PITCHING_SO  0.0028141  0.0009069   3.103 0.001941 ** 
## TEAM_FIELDING_E  -0.0186779  0.0024906  -7.499 9.31e-14 ***
## TEAM_FIELDING_DP -0.1091373  0.0136377  -8.003 1.97e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared:  0.3226, Adjusted R-squared:  0.3179 
## F-statistic:  68.5 on 15 and 2158 DF,  p-value: < 2.2e-16
plot(mod_2)

library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E + 
##     TEAM_FIELDING_DP, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.153  -8.411   0.176   8.307  58.465 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      22.6861348  5.2806294   4.296 1.82e-05 ***
## TEAM_BATTING_H    0.0486089  0.0036841  13.194  < 2e-16 ***
## TEAM_BATTING_2B  -0.0233877  0.0092203  -2.537 0.011265 *  
## TEAM_BATTING_3B   0.0602198  0.0166990   3.606 0.000318 ***
## TEAM_BATTING_HR   0.0770786  0.0097715   7.888 4.83e-15 ***
## TEAM_BATTING_BB   0.0104799  0.0033563   3.122 0.001817 ** 
## TEAM_BATTING_SO  -0.0104007  0.0024834  -4.188 2.93e-05 ***
## TEAM_BASERUN_SB   0.0253857  0.0042813   5.929 3.53e-09 ***
## TEAM_PITCHING_H  -0.0008928  0.0003178  -2.809 0.005008 ** 
## TEAM_PITCHING_SO  0.0030690  0.0006625   4.633 3.82e-06 ***
## TEAM_FIELDING_E  -0.0184139  0.0024107  -7.639 3.28e-14 ***
## TEAM_FIELDING_DP -0.1095211  0.0136173  -8.043 1.43e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 2162 degrees of freedom
## Multiple R-squared:  0.3218, Adjusted R-squared:  0.3184 
## F-statistic: 93.27 on 11 and 2162 DF,  p-value: < 2.2e-16

Understanding the role of double plays - remove the influence of hits:

ggplot(dfTrain_ImputedMedian, aes(TEAM_FIELDING_DP, TEAM_PITCHING_H)) +
  geom_point()

ggplot(dfTrain, aes(TEAM_FIELDING_DP, TEAM_PITCHING_H)) +
  geom_point()
## Warning: Removed 286 rows containing missing values (geom_point).

cor(dfTrain_ImputedMedian$TEAM_FIELDING_DP, dfTrain_ImputedMedian$TEAM_PITCHING_H)
## [1] -0.08895731
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, dfTrain))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, 
##     data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -66.999  -9.102   0.739  10.013  43.146 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.0829610  2.4592867  30.530  < 2e-16 ***
## TEAM_FIELDING_DP -0.0045343  0.0121655  -0.373    0.709    
## TEAM_PITCHING_H   0.0041845  0.0008319   5.030 5.34e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.85 on 1987 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.01377,    Adjusted R-squared:  0.01278 
## F-statistic: 13.87 on 2 and 1987 DF,  p-value: 1.038e-06
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, dfTrain_ImputedMedian))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, 
##     data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.237  -9.564   0.855  10.359  68.964 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      87.1139149  2.2975487  37.916  < 2e-16 ***
## TEAM_FIELDING_DP -0.0271240  0.0147930  -1.834   0.0669 .  
## TEAM_PITCHING_H  -0.0012921  0.0002317  -5.576 2.76e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2171 degrees of freedom
## Multiple R-squared:  0.01493,    Adjusted R-squared:  0.01402 
## F-statistic: 16.45 on 2 and 2171 DF,  p-value: 8.127e-08
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP*TEAM_PITCHING_H, dfTrain))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP * TEAM_PITCHING_H, 
##     data = dfTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -69.126  -9.261   1.004   9.713  47.202 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.023e+02  5.724e+00  17.872  < 2e-16 ***
## TEAM_FIELDING_DP                 -2.549e-01  4.914e-02  -5.188 2.35e-07 ***
## TEAM_PITCHING_H                  -1.244e-02  3.269e-03  -3.806 0.000145 ***
## TEAM_FIELDING_DP:TEAM_PITCHING_H  1.561e-04  2.970e-05   5.257 1.62e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.76 on 1986 degrees of freedom
##   (286 observations deleted due to missingness)
## Multiple R-squared:  0.02731,    Adjusted R-squared:  0.02584 
## F-statistic: 18.59 on 3 and 1986 DF,  p-value: 6.864e-12
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP*TEAM_PITCHING_H, dfTrain_ImputedMedian))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP * TEAM_PITCHING_H, 
##     data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.162  -9.515   0.820  10.312  69.257 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       7.833e+01  5.757e+00  13.607   <2e-16 ***
## TEAM_FIELDING_DP                  3.302e-02  3.906e-02   0.845   0.3981    
## TEAM_PITCHING_H                   3.513e-03  2.898e-03   1.212   0.2256    
## TEAM_FIELDING_DP:TEAM_PITCHING_H -3.328e-05  2.001e-05  -1.663   0.0964 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2170 degrees of freedom
## Multiple R-squared:  0.01618,    Adjusted R-squared:  0.01482 
## F-statistic:  11.9 on 3 and 2170 DF,  p-value: 9.984e-08

The interaction temr makes a difference.

Taking a log of Pitching_H:

ggplot(dfTrain_ImputedMedian, aes(dfTrain_ImputedMedian$TEAM_PITCHING_H)) +
  geom_histogram(bins=100)
## Warning: Use of `dfTrain_ImputedMedian$TEAM_PITCHING_H` is discouraged. Use
## `TEAM_PITCHING_H` instead.

dfTrain_ImputedMedian5 <- dfTrain_ImputedMedian2 %>%
  mutate(logPitch_h = TEAM_PITCHING_H^2)

ggplot(dfTrain_ImputedMedian5, aes(logPitch_h, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ TEAM_PITCHING_H + logPitch_h, dfTrain_ImputedMedian5)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H + logPitch_h, data = dfTrain_ImputedMedian5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -63.631  -9.694   1.045  10.242  64.174 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.944e+01  9.013e-01  88.133  < 2e-16 ***
## TEAM_PITCHING_H  1.126e-03  5.376e-04   2.094   0.0364 *  
## logPitch_h      -1.313e-07  2.682e-08  -4.897 1.05e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.39 on 2171 degrees of freedom
## Multiple R-squared:  0.02418,    Adjusted R-squared:  0.02328 
## F-statistic:  26.9 on 2 and 2171 DF,  p-value: 2.895e-12
plot(m)

A closer look at Pitching_h. Taking out th outliers.

dfTrain_ImputedMedian6 <- dfTrain_ImputedMedian5 %>%
  dplyr::filter(TEAM_PITCHING_H <= 1500)

dfTrain_ImputedMedian7 <- dfTrain_ImputedMedian5 %>%
  dplyr::filter(TEAM_PITCHING_H > 2000)

ggplot(dfTrain_ImputedMedian6, aes(TEAM_PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian6)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian6)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.864  -8.396   0.413   8.870  30.267 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.26774    8.46728   0.976    0.329    
## TEAM_PITCHING_H  0.04990    0.00602   8.289 3.78e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.22 on 970 degrees of freedom
## Multiple R-squared:  0.06614,    Adjusted R-squared:  0.06518 
## F-statistic:  68.7 on 1 and 970 DF,  p-value: 3.785e-16
plot(m)

ggplot(dfTrain_ImputedMedian7, aes(TEAM_PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian7)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.879 -13.887   2.392  15.885  65.947 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     90.487384   2.180193   41.50  < 2e-16 ***
## TEAM_PITCHING_H -0.002207   0.000418   -5.28 2.77e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.87 on 255 degrees of freedom
## Multiple R-squared:  0.09855,    Adjusted R-squared:  0.09502 
## F-statistic: 27.88 on 1 and 255 DF,  p-value: 2.767e-07
plot(m)

ggplot(dfTrain_ImputedMedian, aes(TEAM_PITCHING_H, TARGET_WINS)) +
          stat_smooth(method=loess) +
          geom_point()
## `geom_smooth()` using formula 'y ~ x'

m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.165  -9.462   0.897  10.651  68.914 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83.0150688  0.5308401 156.384  < 2e-16 ***
## TEAM_PITCHING_H -0.0012543  0.0002309  -5.432  6.2e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared:  0.0134, Adjusted R-squared:  0.01295 
## F-statistic:  29.5 on 1 and 2172 DF,  p-value: 6.205e-08
plot(m)

Eliminting outliers has no effect - but show outliers seem to be grouped (compare new outliers with old):

dfTrain_ImputedMedian_nooutliers <- dfTrain_ImputedMedian %>%
  dplyr::filter(INDEX != 1211 & INDEX != 1342 & INDEX != 1810)

m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian_nooutliers)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian_nooutliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.170  -9.460   0.889  10.636  68.905 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83.0181857  0.5306250  156.45  < 2e-16 ***
## TEAM_PITCHING_H -0.0012530  0.0002307   -5.43 6.26e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.46 on 2169 degrees of freedom
## Multiple R-squared:  0.01341,    Adjusted R-squared:  0.01296 
## F-statistic: 29.49 on 1 and 2169 DF,  p-value: 6.263e-08
plot(m)

looking for interactions:

par(mfcol=c(2,2))

dfTrain_ImputedMedian8 <- dfTrain_ImputedMedian %>%
  mutate(Pitch_h_Under1500 = ifelse(TEAM_PITCHING_H<=1500, 1, 0))

dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.factor(dfTrain_ImputedMedian8$Pitch_h_Under1500)

for(i in 2:ncol(dfTrain_ImputedMedian8)) {                              
  print(ggplot(dfTrain_ImputedMedian8, aes(dfTrain_ImputedMedian8[ ,i], TARGET_WINS, color=Pitch_h_Under1500)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  ggtitle(colnames(dfTrain_ImputedMedian8)[i]))
}
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

Similar analysis with the data missing records:

dfTrain_flag <- dfTrain2 %>%
  mutate(Missing_Flag = ifelse(is.na(TEAM_BATTING_SO),1,0))

mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_flag)
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP + 
##     TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_E + TEAM_FIELDING_DP, data = dfTrain_flag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.2248  -5.6294  -0.0212   5.0439  21.3065 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      60.95454   19.10292   3.191 0.001670 ** 
## TEAM_BATTING_H    0.02541    0.01009   2.518 0.012648 *  
## TEAM_BATTING_HBP  0.08712    0.04852   1.796 0.074211 .  
## TEAM_PITCHING_HR  0.08945    0.02394   3.736 0.000249 ***
## TEAM_PITCHING_BB  0.05672    0.00940   6.034 8.66e-09 ***
## TEAM_PITCHING_SO -0.03136    0.00728  -4.308 2.68e-05 ***
## TEAM_FIELDING_E  -0.17218    0.03970  -4.338 2.38e-05 ***
## TEAM_FIELDING_DP -0.11904    0.03516  -3.386 0.000869 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.422 on 183 degrees of freedom
##   (2080 observations deleted due to missingness)
## Multiple R-squared:  0.5345, Adjusted R-squared:  0.5167 
## F-statistic: 30.02 on 7 and 183 DF,  p-value: < 2.2e-16

Only interaction appears with the fielding_errors. Hwoever, If we interact with itself it greatly improves the r squared.

dfTrain_ImputedMedian9 <- dfTrain_ImputedMedian8 %>%
  mutate(Pitch_h_squared = TEAM_PITCHING_H^2) %>%
    mutate(Pitch_h_log = log(TEAM_PITCHING_H)) %>%
    mutate(Pitch_h_sqrt = sqrt(TEAM_PITCHING_H))

summary(lm(TARGET_WINS ~ Pitch_h_squared, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_squared, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.015  -9.069   0.997  10.158  66.609 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.119e+01  3.359e-01 241.736  < 2e-16 ***
## Pitch_h_squared -8.054e-08  1.147e-08  -7.024 2.88e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.4 on 2172 degrees of freedom
## Multiple R-squared:  0.02221,    Adjusted R-squared:  0.02176 
## F-statistic: 49.33 on 1 and 2172 DF,  p-value: 2.883e-12
summary(lm(TARGET_WINS ~ Pitch_h_log, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_log, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -78.408  -9.582   1.145  10.356  66.161 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  87.2807     7.9389  10.994   <2e-16 ***
## Pitch_h_log  -0.8795     1.0706  -0.822    0.411    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared:  0.0003106,  Adjusted R-squared:  -0.0001496 
## F-statistic: 0.6749 on 1 and 2172 DF,  p-value: 0.4114
summary(lm(TARGET_WINS ~ Pitch_h_sqrt, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_sqrt, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -67.753  -9.477   0.982  10.732  68.378 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  85.48013    1.47144   58.09  < 2e-16 ***
## Pitch_h_sqrt -0.11429    0.03474   -3.29  0.00102 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.54 on 2172 degrees of freedom
## Multiple R-squared:  0.00496,    Adjusted R-squared:  0.004501 
## F-statistic: 10.83 on 1 and 2172 DF,  p-value: 0.001017
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H*Pitch_h_Under1500, dfTrain_ImputedMedian8)
summary(m)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H * Pitch_h_Under1500, 
##     data = dfTrain_ImputedMedian8)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -60.864  -9.153   0.979   9.772  67.940 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         8.643e+01  6.550e-01 131.965  < 2e-16 ***
## TEAM_PITCHING_H                    -1.771e-03  2.322e-04  -7.628 3.55e-14 ***
## Pitch_h_Under15001                 -7.816e+01  1.047e+01  -7.466 1.19e-13 ***
## TEAM_PITCHING_H:Pitch_h_Under15001  5.167e-02  7.432e-03   6.952 4.76e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.08 on 2170 degrees of freedom
## Multiple R-squared:  0.06361,    Adjusted R-squared:  0.06232 
## F-statistic: 49.14 on 3 and 2170 DF,  p-value: < 2.2e-16
plot(m)

summary(lm(TARGET_WINS ~ TEAM_FIELDING_E*Pitch_h_Under1500, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E * Pitch_h_Under1500, 
##     data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.182  -9.571   0.598   9.826  73.499 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                        87.867745   0.643380 136.572  < 2e-16 ***
## TEAM_FIELDING_E                    -0.016158   0.001498 -10.787  < 2e-16 ***
## Pitch_h_Under15001                 -0.776515   1.469068  -0.529    0.597    
## TEAM_FIELDING_E:Pitch_h_Under15001 -0.042078   0.008364  -5.031 5.28e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.88 on 2170 degrees of freedom
## Multiple R-squared:  0.08892,    Adjusted R-squared:  0.08766 
## F-statistic: 70.59 on 3 and 2170 DF,  p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E, dfTrain_ImputedMedian9))
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = dfTrain_ImputedMedian9)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -61.638  -9.847   0.708  10.050  73.590 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     83.645750   0.476605 175.503   <2e-16 ***
## TEAM_FIELDING_E -0.011815   0.001415  -8.352   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared:  0.03112,    Adjusted R-squared:  0.03067 
## F-statistic: 69.75 on 1 and 2172 DF,  p-value: < 2.2e-16

Final Mods:

dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.numeric(dfTrain_ImputedMedian8$Pitch_h_Under1500)

dfTrain_ImputedMedian10 <- dfTrain_ImputedMedian8 %>%
  mutate(Prod_DP_H = TEAM_FIELDING_DP*TEAM_PITCHING_H) %>%
  mutate(inter_H_Itself = TEAM_PITCHING_H*Pitch_h_Under1500) %>%
  mutate(Inter_H_Err = TEAM_FIELDING_E*Pitch_h_Under1500) %>%
  mutate(TEAM_PITCHING_H = log(TEAM_PITCHING_H)) %>%
  mutate(E_sq = TEAM_FIELDING_E^2)

mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
summary(mod_2)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -50.264  -8.466   0.163   8.273  58.924 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      23.9560970  5.4876280   4.365 1.33e-05 ***
## INDEX            -0.0004771  0.0003788  -1.259 0.207988    
## TEAM_BATTING_H    0.0482928  0.0037112  13.013  < 2e-16 ***
## TEAM_BATTING_2B  -0.0232530  0.0092311  -2.519 0.011841 *  
## TEAM_BATTING_3B   0.0595670  0.0169134   3.522 0.000437 ***
## TEAM_BATTING_HR   0.0655424  0.0272468   2.406 0.016234 *  
## TEAM_BATTING_BB   0.0084691  0.0057882   1.463 0.143567    
## TEAM_BATTING_SO  -0.0100510  0.0025721  -3.908 9.61e-05 ***
## TEAM_BASERUN_SB   0.0254437  0.0044746   5.686 1.47e-08 ***
## TEAM_BASERUN_CS   0.0006521  0.0161429   0.040 0.967780    
## TEAM_PITCHING_H  -0.0009865  0.0003651  -2.702 0.006949 ** 
## TEAM_PITCHING_HR  0.0116273  0.0240289   0.484 0.628514    
## TEAM_PITCHING_BB  0.0014808  0.0040999   0.361 0.718000    
## TEAM_PITCHING_SO  0.0028141  0.0009069   3.103 0.001941 ** 
## TEAM_FIELDING_E  -0.0186779  0.0024906  -7.499 9.31e-14 ***
## TEAM_FIELDING_DP -0.1091373  0.0136377  -8.003 1.97e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared:  0.3226, Adjusted R-squared:  0.3179 
## F-statistic:  68.5 on 15 and 2158 DF,  p-value: < 2.2e-16
par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian10)
summary(mod_2)
## 
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian10)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.102  -8.210   0.314   8.312  61.874 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.650e+02  3.613e+01  -4.567 5.22e-06 ***
## INDEX             -4.911e-04  3.746e-04  -1.311 0.189965    
## TEAM_BATTING_H     3.562e-02  4.340e-03   8.208 3.82e-16 ***
## TEAM_BATTING_2B   -2.155e-02  9.260e-03  -2.327 0.020033 *  
## TEAM_BATTING_3B    8.171e-02  1.760e-02   4.644 3.63e-06 ***
## TEAM_BATTING_HR    1.203e-01  3.066e-02   3.925 8.95e-05 ***
## TEAM_BATTING_BB    2.204e-02  6.083e-03   3.622 0.000299 ***
## TEAM_BATTING_SO   -1.028e-02  2.686e-03  -3.827 0.000133 ***
## TEAM_BASERUN_SB    2.739e-02  4.636e-03   5.908 4.02e-09 ***
## TEAM_BASERUN_CS    5.357e-03  1.621e-02   0.330 0.741104    
## TEAM_PITCHING_H    2.612e+01  5.215e+00   5.008 5.95e-07 ***
## TEAM_PITCHING_HR  -4.297e-02  2.757e-02  -1.559 0.119245    
## TEAM_PITCHING_BB  -5.488e-03  4.231e-03  -1.297 0.194696    
## TEAM_PITCHING_SO   2.273e-03  9.513e-04   2.389 0.016979 *  
## TEAM_FIELDING_E    7.044e-03  1.113e-02   0.633 0.526826    
## TEAM_FIELDING_DP  -8.680e-02  3.424e-02  -2.535 0.011307 *  
## Pitch_h_Under1500  1.014e+01  4.512e+00   2.246 0.024785 *  
## Prod_DP_H         -6.911e-06  1.826e-05  -0.379 0.705076    
## inter_H_Itself    -2.000e-03  2.911e-03  -0.687 0.492098    
## Inter_H_Err       -2.725e-02  8.008e-03  -3.403 0.000678 ***
## E_sq              -7.588e-06  4.520e-06  -1.679 0.093370 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.7 on 2153 degrees of freedom
## Multiple R-squared:  0.3408, Adjusted R-squared:  0.3347 
## F-statistic: 55.66 on 20 and 2153 DF,  p-value: < 2.2e-16
plot(mod_2)

step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
## 
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B + 
##     TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO + 
##     TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO + 
##     TEAM_FIELDING_DP + Pitch_h_Under1500 + inter_H_Itself + Inter_H_Err + 
##     E_sq, data = dfTrain_ImputedMedian10)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.562  -8.169   0.338   8.309  61.515 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.647e+02  3.089e+01  -5.331 1.08e-07 ***
## TEAM_BATTING_H     3.681e-02  4.250e-03   8.660  < 2e-16 ***
## TEAM_BATTING_2B   -2.174e-02  9.232e-03  -2.355 0.018605 *  
## TEAM_BATTING_3B    8.082e-02  1.730e-02   4.671 3.18e-06 ***
## TEAM_BATTING_HR    1.350e-01  2.803e-02   4.818 1.55e-06 ***
## TEAM_BATTING_BB    1.541e-02  3.369e-03   4.575 5.03e-06 ***
## TEAM_BATTING_SO   -9.744e-03  2.630e-03  -3.705 0.000217 ***
## TEAM_BASERUN_SB    2.693e-02  4.281e-03   6.291 3.81e-10 ***
## TEAM_PITCHING_H    2.610e+01  4.390e+00   5.946 3.20e-09 ***
## TEAM_PITCHING_HR  -5.769e-02  2.526e-02  -2.284 0.022461 *  
## TEAM_PITCHING_SO   1.612e-03  7.791e-04   2.068 0.038723 *  
## TEAM_FIELDING_DP  -9.966e-02  1.364e-02  -7.304 3.91e-13 ***
## Pitch_h_Under1500  1.138e+01  1.510e+00   7.532 7.28e-14 ***
## inter_H_Itself    -3.339e-03  5.513e-04  -6.057 1.63e-09 ***
## Inter_H_Err       -2.277e-02  4.433e-03  -5.136 3.05e-07 ***
## E_sq              -5.730e-06  3.596e-06  -1.593 0.111234    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.7 on 2158 degrees of freedom
## Multiple R-squared:  0.3396, Adjusted R-squared:  0.335 
## F-statistic: 73.97 on 15 and 2158 DF,  p-value: < 2.2e-16

Checking interactions with the missing vaolues cohort:

looking for interactions:

par(mfcol=c(2,2))

dfTrain_ImputedMean$Missing_Flag <- as.factor(dfTrain_ImputedMean$Missing_Flag)


for(i in 2:ncol(dfTrain_ImputedMean)) {                              
  print(ggplot(dfTrain_ImputedMean, aes(dfTrain_ImputedMean[ ,i], TARGET_WINS, color=Missing_Flag)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  ggtitle(colnames(dfTrain_ImputedMean)[i]))

}
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'